This RMarkdown file contains the report of the data analysis performed for the project on forecasting daily bike rental demand using time series models in R. It covers data exploration, summary statistics, and the building of time series models. The final report was completed on Wed Apr 9 18:45:11 2025.
Data Description: This dataset contains the daily count of rental bike transactions between years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal information.
Data Source: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset
Relevant Paper: Fanaee-T, Hadi, and Gama, Joao, ‘Event labeling combining ensemble detectors and background knowledge’, Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg
# Install and load required packages
# Guard each install so the report does not re-download packages that are
# already present: an unconditional install.packages() re-installs on every
# render and fails entirely when the machine has no network access.
required_pkgs <- c("timetk", "tidyverse", "lubridate", "forecast", "TTR", "tseries")
for (pkg in required_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# Attach the packages used throughout the analysis. Attachment order matters:
# tidyverse is attached after timetk, and its dplyr::filter()/dplyr::lag()
# mask the stats versions (see the conflict report echoed below).
library(timetk)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# lubridate is already attached as part of core tidyverse 2.0, so this
# explicit call is a harmless no-op; kept for clarity of intent.
library(lubridate)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# TTR provides SMA(); tseries provides adf.test().
library(TTR)
library(tseries)
# Load built-in dataset
# `bike_sharing_daily` ships with timetk: 731 daily observations
# (2011-01-01 through 2012-12-31) of the Capital Bikeshare system.
data("bike_sharing_daily")
# Work on a copy so the lazy-loaded original stays untouched.
bike_data <- bike_sharing_daily
# Preview the first rows to confirm the expected 16 columns are present.
head(bike_data)
## # A tibble: 6 × 16
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2011-01-01 1 0 1 0 6 0 2
## 2 2 2011-01-02 1 0 1 0 0 0 2
## 3 3 2011-01-03 1 0 1 0 1 1 1
## 4 4 2011-01-04 1 0 1 0 2 1 1
## 5 5 2011-01-05 1 0 1 0 3 1 1
## 6 6 2011-01-06 1 0 1 0 4 1 1
## # ℹ 7 more variables: temp <dbl>, atemp <dbl>, hum <dbl>, windspeed <dbl>,
## # casual <dbl>, registered <dbl>, cnt <dbl>
# Recreate a clean `date` column as a daily sequence starting 2011-01-01,
# one date per row (mirrors the existing `dteday` column).
bike_data <- bike_data %>%
  mutate(date = seq(as.Date("2011-01-01"), by = "day", length.out = n()))
# Summary statistics
# Five-number summaries plus means for every column. Note that temp, atemp,
# hum, and windspeed are normalized to [0, 1] in this dataset, while cnt
# (total daily rentals) ranges from 22 to 8714.
summary(bike_data)
## instant dteday season yr
## Min. : 1.0 Min. :2011-01-01 Min. :1.000 Min. :0.0000
## 1st Qu.:183.5 1st Qu.:2011-07-02 1st Qu.:2.000 1st Qu.:0.0000
## Median :366.0 Median :2012-01-01 Median :3.000 Median :1.0000
## Mean :366.0 Mean :2012-01-01 Mean :2.497 Mean :0.5007
## 3rd Qu.:548.5 3rd Qu.:2012-07-01 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :731.0 Max. :2012-12-31 Max. :4.000 Max. :1.0000
## mnth holiday weekday workingday
## Min. : 1.00 Min. :0.00000 Min. :0.000 Min. :0.000
## 1st Qu.: 4.00 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000
## Median : 7.00 Median :0.00000 Median :3.000 Median :1.000
## Mean : 6.52 Mean :0.02873 Mean :2.997 Mean :0.684
## 3rd Qu.:10.00 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.:1.000
## Max. :12.00 Max. :1.00000 Max. :6.000 Max. :1.000
## weathersit temp atemp hum
## Min. :1.000 Min. :0.05913 Min. :0.07907 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.33708 1st Qu.:0.33784 1st Qu.:0.5200
## Median :1.000 Median :0.49833 Median :0.48673 Median :0.6267
## Mean :1.395 Mean :0.49538 Mean :0.47435 Mean :0.6279
## 3rd Qu.:2.000 3rd Qu.:0.65542 3rd Qu.:0.60860 3rd Qu.:0.7302
## Max. :3.000 Max. :0.86167 Max. :0.84090 Max. :0.9725
## windspeed casual registered cnt
## Min. :0.02239 Min. : 2.0 Min. : 20 Min. : 22
## 1st Qu.:0.13495 1st Qu.: 315.5 1st Qu.:2497 1st Qu.:3152
## Median :0.18097 Median : 713.0 Median :3662 Median :4548
## Mean :0.19049 Mean : 848.2 Mean :3656 Mean :4504
## 3rd Qu.:0.23321 3rd Qu.:1096.0 3rd Qu.:4776 3rd Qu.:5956
## Max. :0.50746 Max. :3410.0 Max. :6946 Max. :8714
## date
## Min. :2011-01-01
## 1st Qu.:2011-07-02
## Median :2012-01-01
## Mean :2012-01-01
## 3rd Qu.:2012-07-01
## Max. :2012-12-31
# Correlation between temperature and rentals
# Both raw temperature (temp) and feels-like temperature (atemp) show a
# moderately strong positive correlation (~0.63) with daily rental counts.
cor(bike_data$temp, bike_data$cnt)
## [1] 0.627494
cor(bike_data$atemp, bike_data$cnt)
## [1] 0.6310657
# Mean and median temperature per season
# Season coding follows the dataset: 1 = winter, 2 = spring, 3 = summer,
# 4 = fall (per the UCI data description) — TODO confirm against source docs.
bike_data %>%
group_by(season) %>%
summarise(mean_temp = mean(temp), median_temp = median(temp))
## # A tibble: 4 × 3
## season mean_temp median_temp
## <dbl> <dbl> <dbl>
## 1 1 0.298 0.286
## 2 2 0.544 0.562
## 3 3 0.706 0.715
## 4 4 0.423 0.409
# Monthly averages
# All five summary columns are per-day means within each calendar month
# (pooled across both years).
# FIX: the original column was named `total_rentals` but computed mean(cnt)
# — the echoed values (~2k–6k) are daily means, not monthly totals — so it
# is renamed to `mean_rentals` to match the statistic actually computed.
bike_data %>%
  mutate(month = month(date)) %>%
  group_by(month) %>%
  summarise(
    mean_temp = mean(temp),
    mean_hum = mean(hum),
    mean_wind = mean(windspeed),
    mean_rentals = mean(cnt)
  )
## # A tibble: 12 × 5
## month mean_temp mean_hum mean_wind mean_rentals
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.236 0.586 0.206 2176.
## 2 2 0.299 0.567 0.216 2655.
## 3 3 0.391 0.588 0.223 3692.
## 4 4 0.470 0.588 0.234 4485.
## 5 5 0.595 0.689 0.183 5350.
## 6 6 0.684 0.576 0.185 5772.
## 7 7 0.755 0.598 0.166 5564.
## 8 8 0.709 0.638 0.173 5664.
## 9 9 0.616 0.715 0.166 5767.
## 10 10 0.485 0.694 0.175 5199.
## 11 11 0.369 0.625 0.184 4247.
## 12 12 0.324 0.666 0.177 3404.
# Boxplot of temperature by season
# Compare the distribution of normalized temperature across the four seasons.
temp_by_season <- temp ~ season
boxplot(temp_by_season, data = bike_data, main = "Temperature by Season")
# Interactive plot using timetk: daily rental counts rendered as a plotly
# chart with a range slider for zooming into sub-periods.
plot_time_series(bike_data, date, cnt, .interactive = TRUE, .plotly_slider = TRUE)
# Seasonal diagnostics: rental counts broken out by calendar sub-periods
# (week, month, etc.) to surface recurring seasonal patterns.
plot_seasonal_diagnostics(bike_data, date, cnt)
# Anomaly diagnostics
# Flags unusually high/low daily counts. timetk auto-detects the seasonal
# frequency (7 observations = 1 week) and trend window (92 observations =
# 3 months), as echoed below.
bike_data %>%
plot_anomaly_diagnostics(date, cnt)
## frequency = 7 observations per 1 week
## trend = 92 observations per 3 months
# Convert the daily rental counts to a ts object with yearly seasonality
# (frequency = 365 observations per cycle).
bike_ts <- bike_data$cnt %>% ts(frequency = 365)
# Replace outliers and interpolate missing values before modeling
# (forecast::tsclean).
bike_clean <- bike_ts %>% tsclean()
# Simple Moving Average (SMA)
# Smooth the cleaned series with a 10-day rolling mean (TTR::SMA).
sma_window <- 10
bike_sma <- SMA(bike_clean, n = sma_window)
plot(bike_sma, main = "Simple Moving Average (Order 10)")
# Simple Exponential Smoothing
# Setting gamma = FALSE and beta = FALSE disables the seasonal and trend
# components, leaving level-only (simple) exponential smoothing.
bike_hw <- HoltWinters(bike_clean, gamma = FALSE, beta = FALSE)
plot(bike_hw)
# Decomposition
# Split the raw series into trend, seasonal, and remainder components
# (classical decomposition; requires at least two full cycles, which the
# 731-day series at frequency 365 just satisfies).
decomp <- bike_ts %>% decompose()
plot(decomp)
# Augmented Dickey-Fuller Test for stationarity
# H0: the series contains a unit root (non-stationary). With p = 0.7327
# we fail to reject H0, so the raw series is treated as non-stationary
# and differencing is applied below.
adf.test(bike_ts)
##
## Augmented Dickey-Fuller Test
##
## data: bike_ts
## Dickey-Fuller = -1.6351, Lag order = 9, p-value = 0.7327
## alternative hypothesis: stationary
# ACF and PACF plots
# A slowly decaying ACF is consistent with the non-stationarity indicated
# by the ADF test above.
acf(bike_ts)
pacf(bike_ts)
# Differencing (if necessary)
# One first difference removes the trend; the ADF test on the differenced
# series rejects the unit root (p <= 0.01), so a single difference suffices.
bike_diff <- diff(bike_ts)
plot(bike_diff)
adf.test(bike_diff)
## Warning in adf.test(bike_diff): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: bike_diff
## Dickey-Fuller = -13.798, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
# Auto ARIMA model
# auto.arima() searches over (p,d,q)(P,D,Q) orders on the cleaned series
# and selects ARIMA(1,0,3)(0,1,0)[365] with drift (see summary below).
auto_fit <- auto.arima(bike_clean)
summary(auto_fit)
## Series: bike_clean
## ARIMA(1,0,3)(0,1,0)[365] with drift
##
## Coefficients:
## ar1 ma1 ma2 ma3 drift
## 0.9683 -0.5912 -0.1279 -0.0937 5.7116
## s.e. 0.0224 0.0571 0.0617 0.0576 0.8318
##
## sigma^2 = 986021: log likelihood = -3042.81
## AIC=6097.63 AICc=6097.86 BIC=6121.05
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 5.853004 697.8113 385.8648 -2.699883 9.189324 0.1694626
## ACF1
## Training set -0.003587809
# Manual ARIMA model example
# A non-seasonal ARIMA(1,1,1) fitted with stats::arima() for comparison.
# Its AIC (11862) is far worse than the auto-selected seasonal model,
# reflecting the strong yearly seasonality it ignores.
manual_fit <- arima(bike_clean, order = c(1,1,1))
summary(manual_fit)
##
## Call:
## arima(x = bike_clean, order = c(1, 1, 1))
##
## Coefficients:
## ar1 ma1
## 0.3692 -0.8751
## s.e. 0.0437 0.0213
##
## sigma^2 estimated as 661725: log likelihood = -5928.18, aic = 11862.37
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 11.13247 812.9082 588.1202 -6.429274 19.42943 0.894064 0.01109983
# Residual diagnostics
# Shapiro-Wilk rejects normality of the auto-ARIMA residuals
# (p < 2.2e-16), so prediction intervals based on normality should be
# treated with caution. ACF/PACF of the residuals check for remaining
# autocorrelation left unmodeled.
shapiro.test(residuals(auto_fit))
##
## Shapiro-Wilk normality test
##
## data: residuals(auto_fit)
## W = 0.83801, p-value < 2.2e-16
acf(residuals(auto_fit))
pacf(residuals(auto_fit))
# Forecast the next 25 values
# Produce and plot 25-step-ahead forecasts from both fitted models so the
# seasonal auto-ARIMA and the non-seasonal manual ARIMA can be compared.
horizon <- 25
forecast_auto <- forecast(auto_fit, h = horizon)
plot(forecast_auto, main = "Forecast with Auto ARIMA")
forecast_manual <- forecast(manual_fit, h = horizon)
plot(forecast_manual, main = "Forecast with Manual ARIMA")